import os
import base64
from datetime import datetime
import shutil
from openai import OpenAI
import json

# Load the API key from the environment. Credentials must never be committed
# to source control; any key that was ever hard-coded here should be treated
# as compromised and revoked.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise ValueError("No OpenAI API key found. Please set the OPENAI_API_KEY environment variable.")

# Initialize the OpenAI client
client = OpenAI(api_key=api_key)


def encode_image(image_path):
    """Return the contents of the file at *image_path* as base64 text."""
    with open(image_path, "rb") as fh:
        raw_bytes = fh.read()
    # b64encode yields bytes; decode so the result can be embedded in a string.
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

def _extract_actions(content):
    """Pull the JSON action array out of the model's free-form reply.

    The reply may wrap the array in prose, so the text between the first
    '[' and the last ']' is parsed. Returns a list of action dicts, or an
    empty list when no usable array is present.
    """
    json_start = content.find('[')
    json_end = content.rfind(']')
    # Both brackets must exist and be in order. Test the raw rfind() result:
    # adding 1 first would turn the -1 "not found" sentinel into 0 and hide it.
    if json_start == -1 or json_end <= json_start:
        print("No JSON found in the response")
        return []

    json_str = content[json_start:json_end + 1]
    try:
        # json.loads already maps JSON null to None. parse_constant is only
        # invoked for NaN / Infinity / -Infinity, which are kept as their
        # literal text rather than being turned into floats.
        parsed = json.loads(json_str, parse_constant=lambda name: name)
    except json.JSONDecodeError as e:
        print(f"JSON decode error: {e}")
        return []

    # Bracketed text that is not an action list (e.g. a bounding box such as
    # [10, 20, 30, 40]) parses fine but holds no action objects; drop those.
    return [item for item in parsed if isinstance(item, dict)]


def process_command(user_command, image_path, elements, current_state, action_history, template=None):
    """Ask the model for the next UI actions for a command and screenshot.

    A copy of the screenshot is stored in a 'sent_to_gpt' directory next to
    this file, then the image (base64-encoded) and a prompt describing the
    elements, state and history are sent to the chat completions API.

    Args:
        user_command: The task the user wants accomplished.
        image_path: Path of the screenshot to send.
        elements: Iterable of dicts, each with 'id' and 'bbox' keys.
        current_state: Interpolated verbatim into the prompt.
        action_history: Interpolated verbatim into the prompt.
        template: Optional extra instructions appended to the prompt.

    Returns:
        A dict with keys 'thoughts' (raw reply text), 'actions' (list of
        action dicts; a default 'type' action when none could be parsed) and
        'task_complete' (True when the reply says the task is complete).

    Exceptions raised by file access or by the API call are not caught here.
    """
    # Save a copy of the image
    save_dir = os.path.join(os.path.dirname(__file__), 'sent_to_gpt')
    os.makedirs(save_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    save_path = os.path.join(save_dir, f"sent_image_{timestamp}.png")
    shutil.copy(image_path, save_path)

    base64_image = encode_image(image_path)

    elements_description = "\n".join(f"Element {e['id']}: Bounding box {e['bbox']}" for e in elements)

    template_content = f"\nTemplate instructions:\n{template}" if template else ""

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": f"""
                You are an AI assistant that analyzes screenshots and user commands to determine appropriate actions. 
                The screenshot has been processed to identify potential clickable elements, which are numbered and outlined in red.
                Your task is to interpret the user's command, the screenshot, and the current state to determine the next 1-3 actions to take.

                Available elements:
                {elements_description}

                Current user command: {user_command}
                Current state: {current_state}
                Action history: {action_history}
                Helpful context/task template: {template_content}

                Provide a JSON-formatted string containing an array of 1-3 actions. Each action should have the following fields: 
                action (e.g., 'click', 'type', 'hotkey', 'approximate_click'), target (the element number to interact with, null, or text to click on), and value 
                (the text to type, the hotkey combination, or null if not applicable).

                Available actions:
                - click: Click on a specific numbered element. Set target to the element number and value to null.
                - type: Type text. If typing into a specific element, set target to the element number; otherwise set target to null. Set value to the text to type.
                - hotkey: Perform a keyboard shortcut. Set target to null and value to the key combination (e.g., "ctrl+t" for new tab, "enter" for submit).
                - approximate_click: Click on text that isn't associated with a numbered element. Set target to the text to click on and value to null.

                Tips:
                1. When you see a form, aim for the placeholder text.
                2. If you are in a list of results, choose the most relevant result.

                Important: 
                1. If the required elements or website are not visible in the current screenshot, take proactive steps to navigate to the correct page or open a new tab.
                2. For tasks involving logging into websites, always start by opening a new tab and navigating to the website's login page if it's not already visible.
                3. Provide specific, actionable steps. Do not suggest manual actions or ask for more information unless absolutely necessary.
                4. After suggesting actions, indicate whether the task is complete or if further steps will be needed.

                Always provide at least one action unless the task is explicitly complete.
                """
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": f"What should I do next to accomplish the task: {user_command}"},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}}
                ]
            }
        ],
        max_tokens=800,
        timeout=120
    )

    # The API may return no text content; normalise to an empty string so the
    # string operations below cannot fail on None.
    content = response.choices[0].message.content or ""
    print("Raw GPT response:", content)  # Debug print

    actions = _extract_actions(content)

    if not actions:
        print("No actions provided, generating a default action")
        actions = [{"action": "type", "target": None, "value": "No specific action could be determined. Please provide more context or a clearer command."}]

    print("Parsed actions:", actions)  # Debug print

    lowered = content.lower()
    task_complete = "task complete" in lowered or "task is complete" in lowered

    return {
        "thoughts": content,
        "actions": actions,
        "task_complete": task_complete
    }